Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Data manipulation libraries
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

# Avoid Warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [3]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
In [4]:
print(df.shape)
df.head()
(5110, 12)
Out[4]:
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1

Dropping the id column as it's just an identifier

In [5]:
df.drop(['id'],axis=1,inplace=True)

Checking for missing data

In [6]:
# Count of missing cells per column
missing_values_count = df.isna().sum()

# Percentage of missing cells over the whole table.
# np.prod replaces np.product, which was deprecated and removed in NumPy 2.0.
total_cells = np.prod(df.shape)
total_missing = missing_values_count.sum()
percent_missing = (total_missing / total_cells) * 100
print("Percentage of missing data from the dataset is : {}%".format(percent_missing))
Percentage of missing data from the dataset is : 0.35758761786159043%

Plotting a heatmap to check for missing data features

In [7]:
plt.figure(figsize = (12,6))
sns.heatmap(df.isnull())
plt.show()

Filling the missing data in bmi column with mean

In [8]:
# Assign the filled column back instead of chained inplace fillna, which
# raises FutureWarning in pandas 2.x and stops working under copy-on-write.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

Univariate analysis

In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB

Making different arrays for categorical and continuous features

In [10]:
cat_cols = ["gender","hypertension","heart_disease","ever_married","work_type","Residence_type","smoking_status","stroke"]
cont_cols = ["age","avg_glucose_level","bmi"]

Count Plot of Categorical features

In [11]:
fig,axes = plt.subplots(4,2,figsize = (16,16))
sns.set_style('darkgrid')
fig.suptitle("Count plot for various categorical features")

sns.countplot(ax=axes[0,0],data=df,x='gender')
sns.countplot(ax=axes[0,1],data=df,x='hypertension')
sns.countplot(ax=axes[1,0],data=df,x='heart_disease')
sns.countplot(ax=axes[1,1],data=df,x='ever_married')
sns.countplot(ax=axes[2,0],data=df,x='work_type')
sns.countplot(ax=axes[2,1],data=df,x='Residence_type')
sns.countplot(ax=axes[3,0],data=df,x='smoking_status')
sns.countplot(ax=axes[3,1],data=df,x='stroke')

plt.show()

Box Plot of age

In [12]:
fig = px.box(data_frame = df,
            x = "age",
            width = 800,
            height = 300)
fig.update_layout({"template":"plotly_dark"})
fig.show()

Box Plot of avg_glucose_level

In [13]:
fig = px.box(data_frame = df,
            x = "avg_glucose_level",
            width = 800,
            height = 300)
fig.update_layout({"template":"plotly_dark"})
fig.show()

Box Plot of bmi

In [14]:
fig = px.box(data_frame = df,
            x = "bmi",
            width = 800,
            height = 300)
fig.update_layout({"template":"plotly_dark"})
fig.show()

Distribution Plot of age

In [15]:
age = list(df['age'].values)

hist_data = [age]
group_labels = ['age']
colors = ['Orange']
fig = ff.create_distplot(hist_data,group_labels,show_hist = True,colors=colors)
fig.update_layout({"template":"plotly_dark"})
fig.show()

Distribution Plot of avg_glucose_level

In [16]:
avg_glucose_level = list(df['avg_glucose_level'].values)
hist_data = [avg_glucose_level]
group_labels = ['avg_glucose_level']
colors = ['Orange']
fig = ff.create_distplot(hist_data,group_labels,show_hist = True,colors=colors)
fig.update_layout({"template":"plotly_dark"})
fig.show()

Distribution Plot of bmi

In [17]:
bmi = list(df['bmi'].values)
hist_data = [bmi]
group_labels = ["bmi"]
colors = ['Orange']
fig = ff.create_distplot(hist_data,group_labels,show_hist = True,colors=colors)
fig.update_layout({"template":"plotly_dark"})
fig.show()

Bivariate analysis

In [18]:
cat_cols = ["gender","hypertension","heart_disease","ever_married","work_type","Residence_type","smoking_status","stroke"]
cont_cols = ["age","avg_glucose_level","bmi"]

Correlation plot of Continuous features

In [19]:
cr = df[cont_cols].corr(method='pearson')
plt.figure(figsize = (6,6))
sns.heatmap(cr,cmap="coolwarm")
plt.show()

Scatter plot for age vs avg_glucose_level with a Stroke hue

In [20]:
plt.figure(figsize=(8,8))
sns.set_style("darkgrid")
sns.scatterplot(data = df, x = 'age', y = 'avg_glucose_level', hue='stroke')
plt.show()

Scatter plot for avg_glucose_level vs bmi with a Stroke hue

In [21]:
plt.figure(figsize=(8,8))
sns.set_style("darkgrid")
sns.scatterplot(data = df, x = 'avg_glucose_level', y = 'bmi', hue='stroke')
plt.show()

Scatter plot for age vs bmi with a Stroke hue

In [22]:
plt.figure(figsize=(8,8))
sns.set_style("darkgrid")
sns.scatterplot(data = df, x = 'age', y = 'bmi', hue='stroke')
plt.show()

Violin plot for continuous features

In [23]:
plt.figure(figsize=(16,6))
plt.subplot(1,3,1)
sns.violinplot(x = 'stroke', y = 'age', data = df)
plt.subplot(1,3,2)
sns.violinplot(x = 'stroke', y = 'avg_glucose_level', data = df)
plt.subplot(1,3,3)
sns.violinplot(x = 'stroke', y = 'bmi', data = df)
plt.show()

Scatter-matrix of the dataset

In [24]:
plt.figure(figsize = (16,16))
sns.pairplot(df,hue='stroke')
plt.show()
<Figure size 1152x1152 with 0 Axes>

Data preprocessing

In [25]:
df["gender"].value_counts()
Out[25]:
Female    2994
Male      2115
Other        1
Name: gender, dtype: int64
In [26]:
df.drop(df[df['gender'] == 'Other'].index, inplace = True)
df["gender"].value_counts()
Out[26]:
Female    2994
Male      2115
Name: gender, dtype: int64

Checking the effect of outliers on the reduction of dataset

In [27]:
print("The number of people who don't have stroke : ", df['stroke'].value_counts()[0])
# Fixed copy-paste bug: this line reports the positive (stroke == 1) class.
print("The number of people who have stroke : ", df['stroke'].value_counts()[1])
cond1 = df['avg_glucose_level'] > 170
cond2 = df['stroke'] == 1
print("The number of outliers in avg_glucose_level with stroke = 1 are : ", df[cond1 & cond2].shape)
cond3 = df['bmi'] > 47
cond4 = df['stroke'] == 1
print("The number of outliers in bmi with stroke = 1 are : ", df[cond3 & cond4].shape)
The number of people who don't have stroke :  4860
The number of people who don't have stroke :  249
The number of outliers in avg_glucose_level with stroke = 1 are :  (83, 11)
The number of outliers in bmi with stroke = 1 are :  (3, 11)
In [28]:
print("The shape before removing the BMI outliers : ",df.shape)
df.drop(df[df['bmi'] > 47].index, inplace = True)
print("The shape after removing the BMI outliers : ",df.shape)
The shape before removing the BMI outliers :  (5109, 11)
The shape after removing the BMI outliers :  (4992, 11)
In [29]:
plt.figure(figsize = (14,5))
# distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot with kde=True is the drop-in replacement.
sns.histplot(df['bmi'], kde=True, color='red')
plt.show()
In [30]:
df.dtypes
Out[30]:
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object
In [31]:
# Label-encode the remaining object (string) columns to integer codes.
from sklearn.preprocessing import LabelEncoder
object_cols = ["gender","ever_married","work_type","Residence_type","smoking_status"]
label_encoder = LabelEncoder()
for col in object_cols:
    # fit_transform == fit followed by transform on the same column
    df[col] = label_encoder.fit_transform(df[col])

SMOTE

In [32]:
# Balance the stroke classes with SMOTE oversampling.
from imblearn.over_sampling import SMOTE

X = df.drop(columns=['stroke'])
y = df[['stroke']]
X, y = SMOTE(random_state=42).fit_resample(X, y['stroke'].values.ravel())
y = pd.DataFrame({'stroke': y})

# Confirm the resampled target is now balanced.
sns.countplot(data=y, x='stroke')
plt.show()
In [33]:
# Joining back dataset
df = pd.concat([X,y],axis = 1)
df.head()
Out[33]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 1 67.0 0 1 1 2 1 228.69 36.600000 1 1
1 0 61.0 0 0 1 3 0 202.21 28.893237 2 1
2 1 80.0 0 1 1 2 0 105.92 32.500000 2 1
3 0 49.0 0 0 1 2 1 171.23 34.400000 3 1
4 0 79.0 1 0 1 3 0 174.12 24.000000 2 1
In [34]:
# Shuffling the dataset before model development.
# A fixed seed makes the row order — and therefore the positional
# train/test split below — reproducible across runs.
df = df.sample(frac = 1, random_state = 42)

Hybrid Deep TL

In [35]:
import torch
import torch.nn as nn
In [36]:
cat_cols = ["gender","hypertension","heart_disease","ever_married","work_type","Residence_type","smoking_status"]
cont_cols = ["age","avg_glucose_level","bmi"]
y_col = ["stroke"]
In [37]:
for cat in cat_cols:
    df[cat] = df[cat].astype('category')
In [38]:
df.dtypes
Out[38]:
gender               category
age                   float64
hypertension         category
heart_disease        category
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke                  int64
dtype: object
In [39]:
# stacking the categorical columns
cats = np.stack([df[col].cat.codes.values for col in cat_cols], 1)
cats[:5]
Out[39]:
array([[0, 0, 0, 1, 3, 0, 2],
       [0, 0, 0, 0, 4, 0, 0],
       [0, 0, 0, 0, 2, 1, 2],
       [1, 0, 0, 0, 4, 0, 0],
       [0, 0, 0, 0, 4, 0, 0]], dtype=int8)
In [40]:
# converting the stack into tensor
cats = torch.tensor(cats, dtype = torch.int64)
cats[:5]
Out[40]:
tensor([[0, 0, 0, 1, 3, 0, 2],
        [0, 0, 0, 0, 4, 0, 0],
        [0, 0, 0, 0, 2, 1, 2],
        [1, 0, 0, 0, 4, 0, 0],
        [0, 0, 0, 0, 4, 0, 0]])
In [41]:
# stacking the continuous columns & converting to tensor
conts = np.stack([df[col].values for col in cont_cols], 1)
conts = torch.tensor(conts, dtype=torch.float)
conts[:5]
Out[41]:
tensor([[ 78.0000, 109.4700,  30.8000],
        [ 14.0000,  92.2200,  22.8000],
        [ 68.1670,  80.2944,  29.4161],
        [  8.0000, 104.3000,  18.5000],
        [  0.7200,  62.1300,  16.8000]])
In [42]:
# converting target variable to tensor and flattening since CrossEntropyLoss expects a 1-d tensor
y = torch.tensor(df[y_col].values).flatten()
y[:5]
Out[42]:
tensor([1, 0, 1, 0, 0])
In [43]:
print(cats.shape)
print(conts.shape)
print(y.shape)
torch.Size([9492, 7])
torch.Size([9492, 3])
torch.Size([9492])
In [44]:
cat_szs = [len(df[col].cat.categories) for col in cat_cols]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
emb_szs
Out[44]:
[(2, 1), (2, 1), (2, 1), (2, 1), (5, 3), (2, 1), (4, 2)]
In [45]:
class HDTL(nn.Module):

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum((nf for ni,nf in emb_szs))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in,i)) 
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
        layerlist.append(nn.Linear(layers[-1],out_sz))
            
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x
In [46]:
torch.manual_seed(42)
model = HDTL(emb_szs, conts.shape[1], 2, [400,200,100], p=0.2)
model
Out[46]:
HDTL(
  (embeds): ModuleList(
    (0): Embedding(2, 1)
    (1): Embedding(2, 1)
    (2): Embedding(2, 1)
    (3): Embedding(2, 1)
    (4): Embedding(5, 3)
    (5): Embedding(2, 1)
    (6): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.2, inplace=False)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=13, out_features=400, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=400, out_features=200, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=200, out_features=100, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=100, out_features=2, bias=True)
  )
)

Adam Optimization

In [47]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Performing train/test split

In [48]:
batch_size = 9000
test_size = 492

# Positional split: first `split` rows train, next `test_size` rows test.
split = batch_size - test_size
cat_train, cat_test = cats[:split], cats[split:batch_size]
con_train, con_test = conts[:split], conts[split:batch_size]
y_train, y_test = y[:split], y[split:batch_size]
In [49]:
print(len(cat_train))
print(len(cat_test))
8508
492

Training the model

In [50]:
import time
start_time = time.time()

epochs = 320
losses = []

for i in range(epochs):
    i += 1
    # Full-batch forward pass and loss.
    y_pred = model(cat_train, con_train)
    loss = criterion(y_pred, y_train)
    # Append the Python scalar, not the tensor: keeping the tensor would
    # retain the whole autograd graph for every epoch (memory leak) and
    # breaks plt.plot(losses) on recent torch versions.
    losses.append(loss.item())

    if i % 25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}')
print(f'\nDuration: {time.time() - start_time:.0f} seconds')
epoch:   1  loss: 0.79226971
epoch:  26  loss: 0.39218336
epoch:  51  loss: 0.37174425
epoch:  76  loss: 0.35986394
epoch: 101  loss: 0.34412363
epoch: 126  loss: 0.33559442
epoch: 151  loss: 0.32564360
epoch: 176  loss: 0.31327072
epoch: 201  loss: 0.30764669
epoch: 226  loss: 0.29202586
epoch: 251  loss: 0.28786653
epoch: 276  loss: 0.27623996
epoch: 301  loss: 0.26914999
epoch: 320  loss: 0.25564879

Duration: 22 seconds

Plotting the loss function

In [51]:
plt.plot(range(epochs), losses)
plt.ylabel('Cross Entropy Loss')
plt.xlabel('epoch');

Model Validation

In [52]:
# TO EVALUATE THE ENTIRE TEST SET
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = criterion(y_val, y_test)
print(f'CE Loss: {loss:.8f}')
CE Loss: 0.32725507
In [53]:
rows = 200
correct = 0
groundTruth = []
predictedValues = []
print(f'{"MODEL OUTPUT":26} ARGMAX  Y_TEST')
for i in range(rows):
    print(f'{str(y_val[i]):26} {y_val[i].argmax():^7}{y_test[i]:^7}')
    predictedValues.append(y_val[i].argmax().item())
    # Store plain Python ints (not 0-d tensors) so the sklearn metric
    # calls below receive homogeneous integer label lists.
    groundTruth.append(y_test[i].item())
    if y_val[i].argmax().item() == y_test[i].item():
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')
MODEL OUTPUT               ARGMAX  Y_TEST
tensor([-3.6648,  4.0368])    1      1   
tensor([ 0.5525, -0.1474])    0      1   
tensor([ 1.6007, -1.9164])    0      0   
tensor([ 0.9837, -0.6090])    0      1   
tensor([-0.7187,  1.6396])    1      1   
tensor([-5.2345,  5.8858])    1      1   
tensor([-2.6524,  2.6704])    1      1   
tensor([-1.2360,  1.6339])    1      1   
tensor([-0.4849,  0.4666])    1      0   
tensor([ 5.3401, -4.3897])    0      0   
tensor([-5.2090,  6.0244])    1      1   
tensor([ 4.6090, -5.9303])    0      0   
tensor([-0.1651,  0.5062])    1      0   
tensor([ 4.5006, -4.2582])    0      0   
tensor([ 3.9587, -5.6043])    0      0   
tensor([-2.1463,  1.1674])    1      1   
tensor([ 1.0625, -0.5354])    0      0   
tensor([-1.6575,  1.6055])    1      1   
tensor([ 3.6795, -3.3151])    0      0   
tensor([ 5.2959, -3.9359])    0      0   
tensor([ 0.9864, -0.1980])    0      0   
tensor([-1.5342,  1.1621])    1      1   
tensor([-0.3788,  0.6989])    1      1   
tensor([-1.5084,  1.7176])    1      1   
tensor([-1.8059,  1.4850])    1      1   
tensor([ 1.2848, -0.1600])    0      1   
tensor([-3.9547,  4.4003])    1      1   
tensor([ 3.3475, -4.7056])    0      0   
tensor([0.4064, 0.3733])      0      0   
tensor([ 2.9622, -4.1840])    0      0   
tensor([0.3298, 1.3530])      1      0   
tensor([-0.9158,  0.7533])    1      1   
tensor([-1.5323,  1.0711])    1      1   
tensor([-1.2045,  0.7822])    1      1   
tensor([ 8.9380, -9.5957])    0      0   
tensor([ 2.3522, -3.0694])    0      0   
tensor([ 1.3868, -0.9786])    0      1   
tensor([-1.4608,  1.2464])    1      1   
tensor([-1.0163,  1.2214])    1      1   
tensor([-0.6602,  0.6280])    1      0   
tensor([-0.9138,  1.7076])    1      1   
tensor([ 3.9451, -3.3443])    0      0   
tensor([-1.2441,  0.7437])    1      1   
tensor([-1.2301,  1.1804])    1      1   
tensor([ 1.8433, -2.4643])    0      0   
tensor([-5.8106,  7.0438])    1      1   
tensor([ 4.1980, -4.6885])    0      0   
tensor([-1.0257,  1.0111])    1      1   
tensor([ 5.4107, -7.7384])    0      0   
tensor([-2.1996,  2.4064])    1      1   
tensor([ 5.8648, -6.4721])    0      0   
tensor([-1.1959,  1.6832])    1      1   
tensor([ 1.6985, -1.7254])    0      0   
tensor([-1.4682,  1.3221])    1      1   
tensor([-6.2841,  6.4228])    1      1   
tensor([-0.9477,  0.7935])    1      1   
tensor([ 2.3889, -4.6619])    0      0   
tensor([-2.3440,  2.7245])    1      1   
tensor([ 2.7221, -2.6074])    0      0   
tensor([-0.8304,  0.2186])    1      1   
tensor([-0.2663,  1.0981])    1      1   
tensor([0.4438, 0.4453])      1      0   
tensor([ 0.4363, -0.1028])    0      0   
tensor([-0.8738,  1.7131])    1      1   
tensor([-1.6801,  2.5242])    1      1   
tensor([-0.6547,  1.0151])    1      1   
tensor([ 3.6236, -5.7363])    0      0   
tensor([ 5.1524, -6.9146])    0      0   
tensor([ 1.5192, -1.0500])    0      1   
tensor([-0.1966,  0.8923])    1      1   
tensor([-0.8802,  1.2202])    1      1   
tensor([0.1375, 0.1858])      1      0   
tensor([ 3.8047, -4.9615])    0      0   
tensor([ 2.5116, -1.4940])    0      0   
tensor([ 3.9883, -4.0659])    0      0   
tensor([-1.8746,  2.0217])    1      1   
tensor([ 8.3304, -8.2680])    0      0   
tensor([0.5433, 0.8406])      1      1   
tensor([ 0.4709, -0.2515])    0      1   
tensor([0.4323, 0.7232])      1      0   
tensor([-1.5649,  1.3329])    1      1   
tensor([ 6.5437, -4.5028])    0      0   
tensor([-1.9240,  2.5169])    1      1   
tensor([ 6.5426, -7.3606])    0      0   
tensor([-0.6632,  0.8885])    1      1   
tensor([-0.1710,  0.3433])    1      1   
tensor([ 3.0761, -1.5805])    0      0   
tensor([0.1052, 0.8374])      1      1   
tensor([ 3.4001, -3.2248])    0      0   
tensor([-1.9522,  1.5716])    1      1   
tensor([-0.4040,  0.5958])    1      1   
tensor([ 4.1697, -4.8335])    0      0   
tensor([-1.3107,  1.5069])    1      1   
tensor([-0.8783,  2.4468])    1      1   
tensor([-1.3189,  1.6357])    1      1   
tensor([0.9257, 0.2318])      0      0   
tensor([0.5621, 0.0447])      0      0   
tensor([-1.3827,  0.5372])    1      1   
tensor([0.3075, 0.2312])      0      0   
tensor([-1.0124,  0.8308])    1      1   
tensor([ 3.5640, -4.5364])    0      0   
tensor([ 1.4784, -0.8466])    0      0   
tensor([ 4.3377, -4.1888])    0      0   
tensor([-1.8108,  1.8736])    1      1   
tensor([-0.4033,  1.3654])    1      1   
tensor([ 1.5580, -1.1016])    0      0   
tensor([0.2006, 0.3515])      1      0   
tensor([-1.0733,  1.1660])    1      1   
tensor([ 1.9046, -1.6148])    0      1   
tensor([-0.3729,  0.1084])    1      0   
tensor([-0.1366,  0.9372])    1      0   
tensor([-1.5689,  1.8331])    1      1   
tensor([0.2788, 0.7369])      1      1   
tensor([-6.0764,  4.5331])    1      1   
tensor([ 3.5175, -6.7655])    0      0   
tensor([ 3.8597, -2.8739])    0      0   
tensor([-0.2402, -0.2794])    0      0   
tensor([-1.0959,  0.9929])    1      1   
tensor([-0.2413,  0.4179])    1      0   
tensor([ 1.1656, -1.0662])    0      0   
tensor([-0.1075,  0.0027])    1      0   
tensor([-1.8770,  2.4125])    1      1   
tensor([ 2.4181, -2.6462])    0      0   
tensor([-0.7957,  0.1900])    1      1   
tensor([-1.9950,  1.1096])    1      1   
tensor([-1.2054,  0.8788])    1      1   
tensor([-0.4667,  1.1114])    1      1   
tensor([-0.8549,  0.6311])    1      1   
tensor([-3.1756,  2.2294])    1      1   
tensor([-4.3818,  3.6066])    1      1   
tensor([ 3.3328, -2.8117])    0      0   
tensor([-0.1980,  0.7457])    1      1   
tensor([ 5.9658, -6.0972])    0      0   
tensor([-1.1034,  1.8934])    1      1   
tensor([ 0.5215, -0.3756])    0      1   
tensor([0.3951, 1.0431])      1      0   
tensor([-0.0637, -0.9744])    0      1   
tensor([ 3.3556, -2.5069])    0      0   
tensor([-0.1924, -0.1568])    1      1   
tensor([ 3.0890, -5.9132])    0      0   
tensor([ 4.0710, -5.7962])    0      0   
tensor([ 1.7681, -1.3279])    0      1   
tensor([ 4.6631, -4.4335])    0      0   
tensor([ 3.0850, -1.9125])    0      0   
tensor([0.2304, 0.3478])      1      1   
tensor([ 1.2951, -1.7110])    0      0   
tensor([0.8000, 0.7736])      0      1   
tensor([ 3.3829, -4.5694])    0      0   
tensor([-0.5821,  1.4094])    1      1   
tensor([ 0.1916, -0.9665])    0      0   
tensor([ 4.7643, -4.1319])    0      0   
tensor([ 7.1923, -7.1476])    0      0   
tensor([-0.0370, -0.4505])    0      0   
tensor([ 3.4430, -4.0568])    0      0   
tensor([ 3.1201, -2.2473])    0      0   
tensor([ 2.4983, -4.5766])    0      0   
tensor([-1.1215,  1.7999])    1      1   
tensor([ 1.4295, -0.9601])    0      0   
tensor([-1.1453,  2.0457])    1      0   
tensor([0.0194, 0.8131])      1      1   
tensor([ 6.5679, -8.6103])    0      0   
tensor([-0.0178,  0.3755])    1      0   
tensor([-1.4104,  1.3232])    1      1   
tensor([ 3.9677, -6.4341])    0      0   
tensor([ 4.2109, -1.9789])    0      0   
tensor([-2.4206,  1.7894])    1      1   
tensor([-0.2671,  0.4088])    1      1   
tensor([-0.9420,  0.5966])    1      0   
tensor([0.4905, 0.3423])      0      0   
tensor([ 5.8325, -6.0776])    0      0   
tensor([ 2.1670, -1.1498])    0      0   
tensor([-0.2752,  0.7603])    1      1   
tensor([-1.7617,  1.3504])    1      1   
tensor([ 1.3946, -1.7937])    0      0   
tensor([ 1.2897, -0.0989])    0      0   
tensor([ 0.8426, -0.9220])    0      0   
tensor([-1.9306,  2.1045])    1      1   
tensor([-1.7913,  2.5087])    1      1   
tensor([-2.0754,  2.3230])    1      1   
tensor([-1.8945,  2.1555])    1      1   
tensor([-0.1029,  0.5511])    1      0   
tensor([ 1.7915, -2.6741])    0      0   
tensor([-1.2695,  1.1959])    1      1   
tensor([-1.9812,  1.6638])    1      1   
tensor([-5.4824,  4.7499])    1      1   
tensor([-2.0257,  1.4174])    1      1   
tensor([ 4.0881, -3.9895])    0      0   
tensor([-0.4289,  1.3270])    1      1   
tensor([-1.6243,  1.4639])    1      0   
tensor([0.0518, 0.4653])      1      0   
tensor([ 2.4718, -3.4201])    0      0   
tensor([ 7.9458, -8.8197])    0      0   
tensor([ 0.9526, -0.8556])    0      1   
tensor([0.0436, 0.2219])      1      1   
tensor([-3.1774,  3.6421])    1      1   
tensor([0.1060, 0.4087])      1      1   
tensor([ 0.9797, -0.5827])    0      0   
tensor([ 5.0364, -6.8716])    0      0   
tensor([ 1.2583, -1.9381])    0      0   
tensor([-0.1477,  0.4942])    1      1   

169 out of 200 = 84.50% correct
In [54]:
from sklearn.metrics import f1_score, recall_score,precision_score,accuracy_score
hdtl_f1 = f1_score(groundTruth, predictedValues)
hdtl_rec = recall_score(groundTruth, predictedValues)
hdtl_prec = precision_score(groundTruth, predictedValues)
hdtl_acc = accuracy_score(groundTruth, predictedValues)
In [55]:
from sklearn import metrics
# The positive class is encoded as 1 (stroke). The previous pos_label=2
# never occurs in the labels, which makes the ROC curve degenerate and
# the AUC meaningless.
fpr, tpr, thresholds = metrics.roc_curve(groundTruth, predictedValues, pos_label=1)
hdtl_auc = metrics.auc(fpr, tpr)

SGD Optimization

In [56]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
In [57]:
import time
start_time = time.time()

epochs = 320
losses = []

for i in range(epochs):
    i += 1
    # Full-batch forward pass and loss.
    y_pred = model(cat_train, con_train)
    loss = criterion(y_pred, y_train)
    # Append the Python scalar, not the tensor: keeping the tensor would
    # retain the whole autograd graph for every epoch (memory leak) and
    # breaks plt.plot(losses) on recent torch versions.
    losses.append(loss.item())

    if i % 25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}')
print(f'\nDuration: {time.time() - start_time:.0f} seconds')
epoch:   1  loss: 0.26735392
epoch:  26  loss: 0.26459032
epoch:  51  loss: 0.26126590
epoch:  76  loss: 0.26178080
epoch: 101  loss: 0.25542626
epoch: 126  loss: 0.25579631
epoch: 151  loss: 0.26520121
epoch: 176  loss: 0.26248160
epoch: 201  loss: 0.26555145
epoch: 226  loss: 0.25964329
epoch: 251  loss: 0.26209179
epoch: 276  loss: 0.26341340
epoch: 301  loss: 0.25709203
epoch: 320  loss: 0.26749557

Duration: 21 seconds
In [58]:
plt.plot(range(epochs), losses)
plt.ylabel('Cross Entropy Loss')
plt.xlabel('epoch');
In [59]:
# TO EVALUATE THE ENTIRE TEST SET
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = criterion(y_val, y_test)
print(f'CE Loss: {loss:.8f}')
CE Loss: 0.32895464
In [60]:
rows = 200
correct = 0
groundTruth = []
predictedValues = []
print(f'{"MODEL OUTPUT":26} ARGMAX  Y_TEST')
for i in range(rows):
    print(f'{str(y_val[i]):26} {y_val[i].argmax():^7}{y_test[i]:^7}')
    predictedValues.append(y_val[i].argmax().item())
    # Store plain Python ints (not 0-d tensors) so the sklearn metric
    # calls below receive homogeneous integer label lists.
    groundTruth.append(y_test[i].item())
    if y_val[i].argmax().item() == y_test[i].item():
        correct += 1
print(f'\n{correct} out of {rows} = {100*correct/rows:.2f}% correct')
MODEL OUTPUT               ARGMAX  Y_TEST
tensor([-5.0750,  4.9112])    1      1   
tensor([-4.1945,  4.7308])    1      1   
tensor([ 3.1095, -4.1844])    0      0   
tensor([ 0.2272, -1.5439])    0      1   
tensor([-0.3540,  1.1996])    1      1   
tensor([-4.5905,  5.0157])    1      1   
tensor([-7.5779,  8.3397])    1      1   
tensor([-2.1276,  2.5307])    1      1   
tensor([-0.8509,  1.2689])    1      0   
tensor([ 5.2962, -4.2805])    0      0   
tensor([-2.3378,  3.1315])    1      1   
tensor([ 3.2385, -2.9979])    0      0   
tensor([-0.4753,  0.2905])    1      0   
tensor([ 6.8418, -6.1773])    0      0   
tensor([ 5.1587, -8.2113])    0      0   
tensor([-1.8879,  1.7372])    1      1   
tensor([ 1.0371, -0.5777])    0      0   
tensor([-0.5108,  0.7823])    1      1   
tensor([ 4.7327, -2.9755])    0      0   
tensor([ 3.8440, -2.0594])    0      0   
tensor([-0.4240,  1.4826])    1      0   
tensor([-1.1850,  1.1182])    1      1   
tensor([-1.3883,  0.9878])    1      1   
tensor([-0.9922,  1.7363])    1      1   
tensor([-0.0618,  0.3592])    1      1   
tensor([ 0.4657, -0.0243])    0      1   
tensor([-0.4097,  1.7591])    1      1   
tensor([ 5.0335, -6.6843])    0      0   
tensor([-0.1091,  0.1928])    1      0   
tensor([ 2.8669, -3.1803])    0      0   
tensor([ 0.1201, -0.0292])    0      0   
tensor([-2.3087,  1.5266])    1      1   
tensor([-2.4192,  2.0289])    1      1   
tensor([-0.7538,  1.1741])    1      1   
tensor([ 7.6208, -9.7373])    0      0   
tensor([ 1.0400, -1.4044])    0      0   
tensor([-1.2897,  1.2972])    1      1   
tensor([0.0542, 0.6006])      1      1   
tensor([ 0.5651, -0.3325])    0      1   
tensor([0.0345, 0.2332])      1      0   
tensor([-0.8071,  1.1071])    1      1   
tensor([ 2.9491, -1.9447])    0      0   
tensor([-0.8730,  0.0419])    1      1   
tensor([-1.8086,  1.0156])    1      1   
tensor([ 2.0558, -2.9353])    0      0   
tensor([-6.1399,  5.0340])    1      1   
tensor([ 6.2596, -8.9888])    0      0   
tensor([-1.1757,  0.7963])    1      1   
tensor([ 5.9485, -6.1133])    0      0   
tensor([-2.5615,  1.4409])    1      1   
tensor([ 6.0174, -5.6354])    0      0   
tensor([-1.1671,  1.2700])    1      1   
tensor([-0.4974, -0.4480])    1      0   
tensor([-1.8095,  1.4435])    1      1   
tensor([0.3842, 0.2534])      0      1   
tensor([-0.8999, -0.4249])    1      1   
tensor([ 3.2695, -5.2883])    0      0   
tensor([-1.0281,  1.1916])    1      1   
tensor([ 2.0561, -1.1441])    0      0   
tensor([-1.4398,  1.5194])    1      1   
tensor([-0.9690,  2.3325])    1      1   
tensor([1.1615, 0.0934])      0      0   
tensor([ 1.0072, -1.4080])    0      0   
tensor([-1.1972,  0.9210])    1      1   
tensor([-2.1213,  2.0258])    1      1   
tensor([-1.0513,  1.1365])    1      1   
tensor([ 1.6846, -2.4777])    0      0   
tensor([ 5.8066, -7.4331])    0      0   
tensor([-0.3250,  0.2763])    1      1   
tensor([-0.9304,  1.4151])    1      1   
tensor([-0.9901,  1.6936])    1      1   
tensor([-0.0785,  0.6753])    1      0   
tensor([ 4.1868, -4.8091])    0      0   
tensor([ 2.5553, -1.9617])    0      0   
tensor([ 2.9425, -2.6002])    0      0   
tensor([-0.6627,  0.5056])    1      1   
tensor([ 5.6564, -7.9449])    0      0   
tensor([-0.4370,  1.5107])    1      1   
tensor([-0.3403,  0.4396])    1      1   
tensor([-0.1495,  1.0296])    1      0   
tensor([-2.2328,  1.4957])    1      1   
tensor([ 3.3262, -2.4104])    0      0   
tensor([-1.2391,  2.1810])    1      1   
tensor([ 4.1327, -6.8829])    0      0   
tensor([ 0.4394, -0.3293])    0      1   
tensor([-0.9990,  0.6815])    1      1   
tensor([ 2.5859, -1.7037])    0      0   
tensor([-0.1717,  1.2219])    1      1   
tensor([ 2.9835, -4.4195])    0      0   
tensor([-1.7905,  1.6153])    1      1   
tensor([ 1.4798, -1.1809])    0      1   
tensor([ 2.6663, -3.1021])    0      0   
tensor([-1.1224,  0.9623])    1      1   
tensor([-0.4185,  0.4469])    1      1   
tensor([0.1048, 0.8333])      1      1   
tensor([ 0.5395, -0.8170])    0      0   
tensor([ 0.7043, -0.2097])    0      0   
tensor([-1.3800,  1.5582])    1      1   
tensor([ 0.7567, -0.8437])    0      0   
tensor([-0.8060,  0.7732])    1      1   
tensor([ 5.0572, -5.1888])    0      0   
tensor([ 1.4154, -1.1541])    0      0   
tensor([ 5.2645, -4.7972])    0      0   
tensor([-3.2887,  2.1736])    1      1   
tensor([-0.1599,  1.5518])    1      1   
tensor([ 1.6266, -1.4443])    0      0   
tensor([-0.7341,  0.0467])    1      0   
tensor([-2.2280,  1.4283])    1      1   
tensor([ 1.9095, -1.2476])    0      1   
tensor([-0.8668,  0.7074])    1      0   
tensor([-0.0427,  0.4119])    1      0   
tensor([-2.3771,  1.7142])    1      1   
tensor([-0.2017,  0.0332])    1      1   
tensor([-1.2727,  2.2465])    1      1   
tensor([ 1.7168, -2.7742])    0      0   
tensor([ 3.3168, -2.8698])    0      0   
tensor([0.5383, 0.6530])      1      0   
tensor([0.0445, 0.0318])      0      1   
tensor([-0.0528, -0.0853])    0      0   
tensor([ 1.2339, -0.9614])    0      0   
tensor([ 0.2150, -0.6705])    0      0   
tensor([-0.6161,  0.7418])    1      1   
tensor([ 1.5283, -1.9728])    0      0   
tensor([-1.0951,  2.3360])    1      1   
tensor([-1.9104,  2.1205])    1      1   
tensor([-2.2559,  1.8020])    1      1   
tensor([-0.8534,  2.7272])    1      1   
tensor([-1.5003,  1.7523])    1      1   
tensor([-1.3720,  1.3088])    1      1   
tensor([-8.3279,  6.9531])    1      1   
tensor([ 2.5529, -2.9026])    0      0   
tensor([-0.0184,  0.7474])    1      1   
tensor([ 4.1325, -7.1210])    0      0   
tensor([-0.8982,  1.3050])    1      1   
tensor([-1.0028,  1.0746])    1      1   
tensor([0.3324, 0.6896])      1      0   
tensor([-1.7045, -0.1169])    1      1   
tensor([ 3.9459, -2.1656])    0      0   
tensor([ 0.2970, -0.7280])    0      1   
tensor([ 1.9454, -3.4869])    0      0   
tensor([ 3.3503, -4.5468])    0      0   
tensor([ 0.8585, -0.0359])    0      1   
tensor([ 2.9554, -3.3804])    0      0   
tensor([ 2.1700, -1.1742])    0      0   
tensor([ 0.1255, -0.0486])    0      1   
tensor([ 4.5039, -4.0371])    0      0   
tensor([ 0.3348, -0.5644])    0      1   
tensor([ 6.3335, -5.3751])    0      0   
tensor([-0.7974,  1.3594])    1      1   
tensor([-0.4821,  0.3973])    1      0   
tensor([ 4.1462, -4.5060])    0      0   
tensor([ 6.7035, -5.6386])    0      0   
tensor([-0.8741,  0.8184])    1      0   
tensor([ 2.0286, -2.7466])    0      0   
tensor([ 3.3049, -2.9409])    0      0   
tensor([ 3.6796, -5.8777])    0      0   
tensor([-0.9365,  1.3300])    1      1   
tensor([ 1.8219, -0.9735])    0      0   
tensor([ 0.9921, -0.4714])    0      0   
tensor([-4.7363,  6.8727])    1      1   
tensor([ 5.3614, -7.0203])    0      0   
tensor([0.4679, 0.5823])      1      0   
tensor([-1.7403,  1.5753])    1      1   
tensor([ 5.0142, -6.7968])    0      0   
tensor([ 3.4618, -1.6489])    0      0   
tensor([-1.5497,  1.3662])    1      1   
tensor([-0.0328,  0.1329])    1      1   
tensor([-0.4638,  0.2468])    1      0   
tensor([-1.1858,  1.2393])    1      0   
tensor([ 6.2641, -5.4065])    0      0   
tensor([0.7551, 0.2519])      0      0   
tensor([-1.6268,  0.9807])    1      1   
tensor([-1.5680,  1.2433])    1      1   
tensor([ 1.8622, -1.3814])    0      0   
tensor([ 0.8869, -0.1834])    0      0   
tensor([ 1.5710, -2.4294])    0      0   
tensor([-1.6576,  2.7140])    1      1   
tensor([-2.3535,  3.0772])    1      1   
tensor([-1.6963,  2.0775])    1      1   
tensor([-1.5316,  1.4712])    1      1   
tensor([-0.4923,  0.5039])    1      0   
tensor([ 1.3076, -2.5478])    0      0   
tensor([-0.5500,  0.2588])    1      1   
tensor([-1.4290,  1.4950])    1      1   
tensor([-2.5064,  2.6599])    1      1   
tensor([-1.0176,  1.8349])    1      1   
tensor([ 1.6849, -2.0180])    0      0   
tensor([-0.2094,  0.9011])    1      1   
tensor([-0.6917,  0.7112])    1      0   
tensor([0.3759, 0.7176])      1      0   
tensor([ 3.6840, -3.9236])    0      0   
tensor([ 8.3889, -8.8212])    0      0   
tensor([-0.9466,  0.2461])    1      1   
tensor([0.2565, 0.0997])      0      1   
tensor([-3.4258,  4.0348])    1      1   
tensor([-0.1350,  0.4296])    1      1   
tensor([ 0.4549, -0.6308])    0      0   
tensor([ 5.9172, -7.7422])    0      0   
tensor([ 1.9885, -1.7673])    0      0   
tensor([-0.9038,  1.1802])    1      1   

166 out of 200 = 83.00% correct
In [61]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

# Classification metrics for the HDTL (random-search) model's predictions.
hdtl1_f1 = f1_score(groundTruth, predictedValues)
hdtl1_rec = recall_score(groundTruth, predictedValues)
hdtl1_prec = precision_score(groundTruth, predictedValues)
hdtl1_acc = accuracy_score(groundTruth, predictedValues)

from sklearn import metrics

# BUG FIX: the labels are 0/1, so the positive class is 1. With the original
# pos_label=2 no sample ever matched the positive class, making roc_curve
# return degenerate fpr/tpr arrays and the resulting AUC meaningless.
fpr, tpr, thresholds = metrics.roc_curve(groundTruth, predictedValues, pos_label=1)
hdtl1_auc = metrics.auc(fpr, tpr)

For DNN

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import Normalizer
from tensorflow.keras.layers import Activation, Dense, Dropout, BatchNormalization, Input
from keras.models import Model
from keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
%matplotlib inline
plt.style.use('fivethirtyeight')
Using TensorFlow backend.
In [63]:
# Reload the stroke dataset fresh for the DNN experiments and drop the
# row-identifier column, which carries no predictive signal.
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.drop(['id'],axis=1,inplace=True)
In [64]:
df.head()
Out[64]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
In [65]:
df['bmi'].fillna(df['bmi'].mean(), inplace=True)
In [66]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB
In [67]:
from sklearn import preprocessing

# Integer-encode every categorical column. A single LabelEncoder instance is
# reused because fit_transform refits it from scratch on each column.
label_encoder = preprocessing.LabelEncoder()
for column in ('gender', 'ever_married', 'work_type',
               'Residence_type', 'smoking_status'):
    df[column] = label_encoder.fit_transform(df[column])

# Display the integer codes assigned to the last encoded column (only the
# final expression of a cell is rendered by the notebook).
df['smoking_status'].unique()
Out[67]:
array([1, 2, 3, 0])
In [68]:
# Split features/target: 70% train, then the 30% holdout is halved into
# dev and test sets (15% of the data each).
X = df.drop('stroke', axis =1).values
y = df.stroke.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
# Normalizer rescales each *row* (sample) to unit norm. It is stateless, so
# fit() learns nothing — it is kept only for scikit-learn API consistency.
nl = Normalizer()
nl.fit(X_train)
X_train = nl.transform(X_train)
X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=2)
X_dev = nl.transform(X_dev)
X_test = nl.transform(X_test)
In [69]:
def dnn():
    """Build an uncompiled fully connected binary classifier.

    Architecture: three Dense(128) blocks with BatchNorm and Dropout(0.3),
    ending in a single sigmoid unit for the stroke probability. The input
    width is taken from the global X_train at call time.

    Returns:
        keras Model mapping (n_features,) inputs to a scalar probability.
    """
    inputs = Input(name='inputs', shape=[X_train.shape[1],])
    layer = Dense(128, name='FC1')(inputs)
    layer = BatchNormalization(name='BC1')(layer)
    layer = Activation('relu', name='Activation1')(layer)
    layer = Dropout(0.3, name='Dropout1')(layer)
    layer = Dense(128, name='FC2')(layer)
    layer = BatchNormalization(name='BC2')(layer)
    layer = Activation('relu', name='Activation2')(layer)
    layer = Dropout(0.3, name='Dropout2')(layer)
    layer = Dense(128, name='FC3')(layer)
    layer = BatchNormalization(name='BC3')(layer)
    # NOTE(review): unlike FC1/FC2 there is no ReLU after BC3, so the third
    # block is effectively linear — confirm whether an 'Activation3' layer
    # was meant to be inserted here.
    layer = Dropout(0.3, name='Dropout3')(layer)
    layer = Dense(1, name='OutLayer')(layer)
    layer = Activation('sigmoid', name='sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model
In [70]:
model = dnn()
model.summary()
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
inputs (InputLayer)          [(None, 10)]              0         
_________________________________________________________________
FC1 (Dense)                  (None, 128)               1408      
_________________________________________________________________
BC1 (BatchNormalization)     (None, 128)               512       
_________________________________________________________________
Activation1 (Activation)     (None, 128)               0         
_________________________________________________________________
Dropout1 (Dropout)           (None, 128)               0         
_________________________________________________________________
FC2 (Dense)                  (None, 128)               16512     
_________________________________________________________________
BC2 (BatchNormalization)     (None, 128)               512       
_________________________________________________________________
Activation2 (Activation)     (None, 128)               0         
_________________________________________________________________
Dropout2 (Dropout)           (None, 128)               0         
_________________________________________________________________
FC3 (Dense)                  (None, 128)               16512     
_________________________________________________________________
BC3 (BatchNormalization)     (None, 128)               512       
_________________________________________________________________
Dropout3 (Dropout)           (None, 128)               0         
_________________________________________________________________
OutLayer (Dense)             (None, 1)                 129       
_________________________________________________________________
sigmoid (Activation)         (None, 1)                 0         
=================================================================
Total params: 36,097
Trainable params: 35,329
Non-trainable params: 768
_________________________________________________________________

Bayesian Optimization

In [71]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
In [72]:
reduce_lr = ReduceLROnPlateau()
early_stopping = EarlyStopping(patience=20, min_delta=0.0001)
In [73]:
model.fit(x=X_train, y=y_train, epochs=200, validation_data=(X_dev, y_dev), callbacks=[reduce_lr, early_stopping], verbose=0)
WARNING:tensorflow:AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x0000026C1C658CA8> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
WARNING: AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x0000026C1C658CA8> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
WARNING:tensorflow:AutoGraph could not transform <function Model.make_test_function.<locals>.test_function at 0x0000026C404D41F8> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
WARNING: AutoGraph could not transform <function Model.make_test_function.<locals>.test_function at 0x0000026C404D41F8> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
Out[73]:
<tensorflow.python.keras.callbacks.History at 0x26c1c6643c8>
In [74]:
# Predict on train, dev and test in turn. NOTE(review): the loop overwrites
# y_pred (and the loop variables x, y) on every iteration and keeps nothing
# per-split, so after the loop only the *last* split (test) remains — the
# metric cells below score the test set via these leaked loop variables.
x_lst = [X_train, X_dev, X_test]
y_lst = [y_train, y_dev, y_test]
for i,(x,y) in enumerate(zip(x_lst, y_lst)):
    y_pred = model.predict(x)
    # Round sigmoid probabilities to hard 0/1 labels.
    y_pred = np.around(y_pred)
    y_pred = np.asarray(y_pred)
WARNING:tensorflow:AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x0000026C1C80CD38> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
WARNING: AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x0000026C1C80CD38> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
In [75]:
dnn_acc = accuracy_score(y, y_pred)
In [76]:
# DNN metrics, computed on the last split (test) left over from the loop above.
dnn_f1 = f1_score(y, y_pred)
dnn_rec = recall_score(y, y_pred)
dnn_prec = precision_score(y, y_pred)

# BUG FIX: labels are 0/1, so pos_label must be 1; with pos_label=2 no sample
# is ever treated as positive and the resulting AUC is meaningless.
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred, pos_label=1)
dnn_auc = metrics.auc(fpr, tpr)

For CNN + LSTM

In [77]:
from keras.utils.np_utils import to_categorical
from sklearn.utils import class_weight
from sklearn.metrics import log_loss
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
In [78]:
# Fresh 70/30 split for the CNN+LSTM experiment (same seed as the DNN split).
X = df.drop('stroke', axis =1)
y = df.stroke
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
In [79]:
X_train = X_train.values
X_test = X_test.values

# Conv1D expects 3-D input (samples, timesteps, channels); treat each of the
# 10 features as a timestep with a single channel.
X_train = X_train.reshape(-1, X_train.shape[1],1)
X_test = X_test.reshape(-1, X_test.shape[1],1)
In [80]:
# One-hot encode the 0/1 target to match the model's 2-unit output layer.
Y_train = to_categorical(y_train)
Y_test = to_categorical(y_test)
In [81]:
def showResults(test, pred):
    """Print accuracy, weighted precision, weighted F1, and the confusion
    matrix for a pair of true/predicted label arrays."""
    report_lines = (
        ("Accuracy  : {}", accuracy_score(test, pred)),
        ("Precision : {}", precision_score(test, pred, average='weighted')),
        ("f1Score : {}", f1_score(test, pred, average='weighted')),
    )
    for template, value in report_lines:
        print(template.format(value))
    print(confusion_matrix(test, pred))

SGD Optimization

In [82]:
import tensorflow as tf
# Drop any graph/layer state left over from the previous Keras models.
tf.keras.backend.clear_session()

# 1-D CNN feature extractor -> LSTM -> dense head. The final Dense(2) has no
# softmax, so the model emits raw per-class scores (logits).
model = tf.keras.models.Sequential([tf.keras.layers.Conv1D(filters=64,kernel_size=5,strides=1,padding="causal",activation="relu",input_shape=(X_train.shape[1],X_train.shape[2])),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=1, padding="valid"),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, strides=1, padding="causal", activation="relu"),
    tf.keras.layers.MaxPooling1D(pool_size=2, strides=1, padding="valid"),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(2)
])

# Exponentially decaying learning rate; with decay_steps=1e6 the decay is
# negligible over this run (far fewer than 1e6 optimizer steps are taken).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(5e-4,
                                                             decay_steps=1000000,
                                                             decay_rate=0.98,
                                                             staircase=False)

# NOTE(review): MeanSquaredError on a one-hot 2-class target is unusual —
# CategoricalCrossentropy(from_logits=True) would be the conventional choice
# for this head. Confirm the regression-style loss is intentional.
model.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.8),
              metrics=['acc'])
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv1d (Conv1D)              (None, 10, 64)            384       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 9, 64)             0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 9, 32)             6176      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 8, 32)             0         
_________________________________________________________________
lstm (LSTM)                  (None, 8, 128)            82432     
_________________________________________________________________
flatten (Flatten)            (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               131200    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
=================================================================
Total params: 224,386
Trainable params: 224,386
Non-trainable params: 0
_________________________________________________________________
In [83]:
history = model.fit(X_train, Y_train,epochs=100,steps_per_epoch=200,validation_steps=200)
Epoch 1/100
WARNING:tensorflow:AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x0000026C49526B88> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
WARNING: AutoGraph could not transform <function Model.make_train_function.<locals>.train_function at 0x0000026C49526B88> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
200/200 [==============================] - 1s 3ms/step - loss: 0.1836 - acc: 0.9485
Epoch 2/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0548 - acc: 0.9513
Epoch 3/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0526 - acc: 0.9510
Epoch 4/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0513 - acc: 0.9519
Epoch 5/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0518 - acc: 0.9513
Epoch 6/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0516 - acc: 0.9510
Epoch 7/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0501 - acc: 0.9522
Epoch 8/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0522 - acc: 0.9499
Epoch 9/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0506 - acc: 0.9519
Epoch 10/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0485 - acc: 0.9527
Epoch 11/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0516 - acc: 0.9505
Epoch 12/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0520 - acc: 0.9497
Epoch 13/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0499 - acc: 0.9519
Epoch 14/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0512 - acc: 0.9505
Epoch 15/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0486 - acc: 0.9530
Epoch 16/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0498 - acc: 0.9519
Epoch 17/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0497 - acc: 0.9519
Epoch 18/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0511 - acc: 0.9494
Epoch 19/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0491 - acc: 0.9519
Epoch 20/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0515 - acc: 0.9494A: 0s - loss: 0.0497 - ac
Epoch 21/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0489 - acc: 0.9524
Epoch 22/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0485 - acc: 0.9524
Epoch 23/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0514 - acc: 0.9494
Epoch 24/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0462 - acc: 0.9552
Epoch 25/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0520 - acc: 0.9483
Epoch 26/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0469 - acc: 0.9535
Epoch 27/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0489 - acc: 0.9516
Epoch 28/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0501 - acc: 0.9499
Epoch 29/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0487 - acc: 0.9516
Epoch 30/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0485 - acc: 0.9522
Epoch 31/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0503 - acc: 0.9494
Epoch 32/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0474 - acc: 0.9530
Epoch 33/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0479 - acc: 0.9522
Epoch 34/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0510 - acc: 0.9491
Epoch 35/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0503 - acc: 0.9497
Epoch 36/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0466 - acc: 0.9533
Epoch 37/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0479 - acc: 0.9524
Epoch 38/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0488 - acc: 0.9508
Epoch 39/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0501 - acc: 0.9494
Epoch 40/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0459 - acc: 0.9541
Epoch 41/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0484 - acc: 0.9510
Epoch 42/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0484 - acc: 0.9510
Epoch 43/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0474 - acc: 0.9527
Epoch 44/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0510 - acc: 0.9483
Epoch 45/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0466 - acc: 0.9530
Epoch 46/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0455 - acc: 0.9544A: 0s - loss: 0.0498 - ac
Epoch 47/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0532 - acc: 0.9458
Epoch 48/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0464 - acc: 0.9527
Epoch 49/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0465 - acc: 0.9530
Epoch 50/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0503 - acc: 0.9494
Epoch 51/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0477 - acc: 0.9522
Epoch 52/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0464 - acc: 0.9533
Epoch 53/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0502 - acc: 0.9485
Epoch 54/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0485 - acc: 0.9510
Epoch 55/100
200/200 [==============================] - ETA: 0s - loss: 0.0463 - acc: 0.953 - 1s 3ms/step - loss: 0.0468 - acc: 0.9530
Epoch 56/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0470 - acc: 0.9524
Epoch 57/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0473 - acc: 0.9516
Epoch 58/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0464 - acc: 0.9530
Epoch 59/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0519 - acc: 0.9469
Epoch 60/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0478 - acc: 0.9513
Epoch 61/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0464 - acc: 0.9533
Epoch 62/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0470 - acc: 0.9519
Epoch 63/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0484 - acc: 0.9508A: 0s - loss: 0.0506 - a
Epoch 64/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0444 - acc: 0.9547
Epoch 65/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0544 - acc: 0.9444
Epoch 66/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0443 - acc: 0.9552
Epoch 67/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0461 - acc: 0.9530
Epoch 68/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0516 - acc: 0.9466
Epoch 69/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0438 - acc: 0.9555
Epoch 70/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0487 - acc: 0.9502
Epoch 71/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0481 - acc: 0.9505
Epoch 72/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0456 - acc: 0.9535
Epoch 73/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0456 - acc: 0.9533
Epoch 74/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0480 - acc: 0.9508
Epoch 75/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0479 - acc: 0.9510
Epoch 76/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0542 - acc: 0.9438
Epoch 77/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0439 - acc: 0.9552
Epoch 78/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0442 - acc: 0.9552
Epoch 79/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0497 - acc: 0.9488
Epoch 80/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0500 - acc: 0.9488
Epoch 81/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0425 - acc: 0.9566
Epoch 82/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0489 - acc: 0.9497
Epoch 83/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0476 - acc: 0.9513
Epoch 84/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0496 - acc: 0.9488
Epoch 85/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0454 - acc: 0.9535
Epoch 86/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0481 - acc: 0.9505
Epoch 87/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0495 - acc: 0.9491
Epoch 88/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0432 - acc: 0.9558
Epoch 89/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0497 - acc: 0.9485
Epoch 90/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0466 - acc: 0.9522
Epoch 91/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0446 - acc: 0.9541
Epoch 92/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0507 - acc: 0.9477
Epoch 93/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0467 - acc: 0.9519
Epoch 94/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0475 - acc: 0.9510
Epoch 95/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0461 - acc: 0.9527
Epoch 96/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0476 - acc: 0.9508
Epoch 97/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0474 - acc: 0.9510
Epoch 98/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0459 - acc: 0.9527
Epoch 99/100
200/200 [==============================] - 1s 3ms/step - loss: 0.0451 - acc: 0.9535
Epoch 100/100
 89/200 [============>.................] - ETA: 0s - loss: 0.0528 - acc: 0.9451WARNING:tensorflow:Your input ran out of data; interrupting training. Make sure that your dataset or generator can generate at least `steps_per_epoch * epochs` batches (in this case, 20000 batches). You may need to use the repeat() function when building your dataset.
100/200 [==============>...............] - 0s 3ms/step - loss: 0.0536 - acc: 0.9443
In [84]:
# Plot the training loss curve. BUG FIX: `history` contains no validation
# series (fit() was called without validation data), so the original legend
# entry 'Validation' labelled a curve that was never plotted; only 'Train'
# is listed now.
plt.plot(history.history['loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper left')
plt.savefig('loss.png', format='png', dpi=1200)
plt.show()


# Plot the training accuracy curve (same legend fix as above).
plt.plot(history.history['acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper left')
plt.savefig('accuracy.png', format='png', dpi=1200)
plt.show()
In [85]:
predictions = model.predict(X_test, verbose=1)
WARNING:tensorflow:AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x0000026C2BC1AC18> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
WARNING: AutoGraph could not transform <function Model.make_predict_function.<locals>.predict_function at 0x0000026C2BC1AC18> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert
48/48 [==============================] - 0s 2ms/step
In [86]:
# Convert the 2-unit network outputs and the one-hot targets back to
# integer class labels for the metric functions.
predictcv=np.argmax(predictions,axis=1)
actual_valuecv=np.argmax(Y_test,axis=1)
In [87]:
# CNN+LSTM test-set metrics.
ens_acc = accuracy_score(actual_valuecv, predictcv)
ens_f1 = f1_score(actual_valuecv, predictcv)
ens_rec = recall_score(actual_valuecv, predictcv)
ens_prec = precision_score(actual_valuecv, predictcv)

# BUG FIX: class labels are 0/1, so pos_label must be 1; pos_label=2 matches
# no samples and yields a meaningless AUC.
fpr, tpr, thresholds = metrics.roc_curve(actual_valuecv, predictcv, pos_label=1)
ens_auc = metrics.auc(fpr, tpr)

For Machine Learning

In [88]:
X
Out[88]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status
0 1 67.0 0 1 1 2 1 228.69 36.600000 1
1 0 61.0 0 0 1 3 0 202.21 28.893237 2
2 1 80.0 0 1 1 2 0 105.92 32.500000 2
3 0 49.0 0 0 1 2 1 171.23 34.400000 3
4 0 79.0 1 0 1 3 0 174.12 24.000000 2
... ... ... ... ... ... ... ... ... ... ...
5105 0 80.0 1 0 1 2 1 83.75 28.893237 2
5106 0 81.0 0 0 1 3 1 125.20 40.000000 2
5107 0 35.0 0 0 1 3 0 82.99 30.600000 2
5108 1 51.0 0 0 1 2 0 166.29 25.600000 1
5109 0 44.0 0 0 1 0 1 85.28 26.200000 0

5110 rows × 10 columns

In [89]:
y
Out[89]:
0       1
1       1
2       1
3       1
4       1
       ..
5105    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 5110, dtype: int64

Support Vector Machine

In [90]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# NOTE(review): the SVM is fit and evaluated on the SAME full dataset — these
# are training-set scores, not generalization estimates. The confusion matrix
# in the output shows it predicts the majority class (0) for every sample,
# so the 95% accuracy just mirrors the class imbalance.
SVM = SVC()
SVM.fit(X, y)
predictions = SVM.predict(X)
val1 = (accuracy_score(y, predictions)*100)
print("*Accuracy score for SVM: ", val1, "\n")
print("*Confusion Matrix for SVM: ")
print(confusion_matrix(y, predictions))
print("*Classification Report for SVM: ")
print(classification_report(y, predictions))
*Accuracy score for SVM:  95.12720156555773 

*Confusion Matrix for SVM: 
[[4861    0]
 [ 249    0]]
*Classification Report for SVM: 
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      4861
           1       0.00      0.00      0.00       249

    accuracy                           0.95      5110
   macro avg       0.48      0.50      0.49      5110
weighted avg       0.90      0.95      0.93      5110

In [91]:
# SVM metrics (computed on the training data — see the note on the cell above).
svm_f1 = f1_score(y, predictions)
svm_rec = recall_score(y, predictions)
svm_prec = precision_score(y, predictions)

# BUG FIX: labels are 0/1, so pos_label must be 1 (pos_label=2 matches no
# samples and makes the AUC meaningless).
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
svm_auc = metrics.auc(fpr, tpr)

Decision Tree

In [92]:
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): like the SVM cell, the tree is fit and scored on the same
# full dataset, so these are training-set numbers, not a held-out evaluation.
DT = DecisionTreeClassifier(max_depth =3, random_state = 42)
DT.fit(X, y)
predictions = DT.predict(X)
val2 = (accuracy_score(y, predictions)*100)
print("*Accuracy score for DT: ", val2, "\n")
print("*Confusion Matrix for DT: ")
print(confusion_matrix(y, predictions))
print("*Classification Report for DT: ")
print(classification_report(y, predictions))
*Accuracy score for DT:  95.14677103718199 

*Confusion Matrix for DT: 
[[4858    3]
 [ 245    4]]
*Classification Report for DT: 
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      4861
           1       0.57      0.02      0.03       249

    accuracy                           0.95      5110
   macro avg       0.76      0.51      0.50      5110
weighted avg       0.93      0.95      0.93      5110

In [93]:
# Decision-tree metrics (training-set — see the note on the cell above).
dt_f1 = f1_score(y, predictions)
dt_rec = recall_score(y, predictions)
dt_prec = precision_score(y, predictions)

# BUG FIX: labels are 0/1, so pos_label must be 1 (pos_label=2 matches no
# samples and makes the AUC meaningless).
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
dt_auc = metrics.auc(fpr, tpr)

Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): fit and scored on the same full dataset. The 100% accuracy in
# the output is the forest memorizing its training data, not generalization —
# a held-out test set would be needed for a fair comparison with the other
# models above.
RF = RandomForestClassifier()
RF.fit(X, y)
predictions = RF.predict(X)
val3 = (accuracy_score(y, predictions)*100)
print("*Accuracy score for RF: ", val3, "\n")
print("*Confusion Matrix for RF: ")
print(confusion_matrix(y, predictions))
print("*Classification Report for RF: ")
print(classification_report(y, predictions))
*Accuracy score for RF:  100.0 

*Confusion Matrix for RF: 
[[4861    0]
 [   0  249]]
*Classification Report for RF: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4861
           1       1.00      1.00      1.00       249

    accuracy                           1.00      5110
   macro avg       1.00      1.00      1.00      5110
weighted avg       1.00      1.00      1.00      5110

In [95]:
# Random-forest metrics (training-set — see the note on the cell above).
rf_f1 = f1_score(y, predictions)
rf_rec = recall_score(y, predictions)
rf_prec = precision_score(y, predictions)

# BUG FIX: labels are 0/1, so pos_label must be 1 (pos_label=2 matches no
# samples and makes the AUC meaningless).
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
rf_auc = metrics.auc(fpr, tpr)

Ensemble Method

In [96]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# Hard-voting ensemble of naive Bayes, random forest, and decision tree.
# NOTE(review): also fit and evaluated on the full dataset (training-set
# scores), consistent with the other ML cells.
clf1 = GaussianNB()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = DecisionTreeClassifier()
eclf1 = VotingClassifier(estimators=[('nb', clf1), ('rf', clf2), ('dt', clf3)], voting='hard')
eclf1.fit(X, y)
predictions = eclf1.predict(X)
print("*Confusion Matrix for Voting Classifier: ")
print(confusion_matrix(y, predictions))
*Confusion Matrix for Voting Classifier: 
[[4861    0]
 [   4  245]]
In [97]:
# Accuracy (in %) and per-class report for the hard-voting ensemble,
# still measured on the training data like the other models.
val4 = 100 * accuracy_score(y, predictions)
print("*Accuracy score for Voting: ", val4, "\n")
print("*Classification Report for Voting: ")
print(classification_report(y, predictions))
*Accuracy score for Voting:  99.92172211350294 

*Classification Report for Voting: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4861
           1       1.00      0.98      0.99       249

    accuracy                           1.00      5110
   macro avg       1.00      0.99      1.00      5110
weighted avg       1.00      1.00      1.00      5110

In [98]:
# Voting-ensemble summary metrics (computed on the training set).
vot_f1 = f1_score(y, predictions)
vot_rec = recall_score(y, predictions)
vot_prec = precision_score(y, predictions)

# BUG FIX: the stroke labels are encoded 0/1, so pos_label must be 1.
# With pos_label=2 no sample is ever counted as positive and the
# resulting ROC curve / AUC is meaningless (NaN).
fpr, tpr, thresholds = metrics.roc_curve(y, predictions, pos_label=1)
vot_auc = metrics.auc(fpr, tpr)

Comparison

In [99]:
# Accuracy (in %) of every model, in the same order as the labels in the
# comparison charts below: SVM, DT, RF, Voting, CNN+LSTM-RO, DNN-BO,
# HDTL-BO, HDTL-RO.
score = [val1,val2,val3,val4,ens_acc*100,dnn_acc*100,hdtl_acc*100,hdtl1_acc*100]
In [100]:
# Model names and their bar positions for the comparison chart;
# y_pos and classifier are reused by the plotting cell below.
classifier = ('SVM','DT','RF','Voting','CNN+LSTM-RO','DNN-BO','HDTL-BO','HDTL-RO')
y_pos = np.arange(len(classifier))
print(y_pos)
print(score)
[0 1 2 3 4 5 6 7]
[95.12720156555773, 95.14677103718199, 100.0, 99.92172211350294, 95.10763209393346, 95.30638852672752, 84.5, 83.0]
In [101]:
# Horizontal bar chart of model accuracies.
# FIX: pyplot was re-imported under a second alias (plt2) even though it is
# already imported as `plt` in the notebook's import cell — use the
# existing alias instead of shadowing the module twice.
plt.barh(y_pos, score, align='center', alpha=0.5, color='blue')
plt.yticks(y_pos, classifier)
plt.xlabel('Score')
plt.title('Classification Performance')
plt.show()
In [102]:
# Grouped bar chart: precision / recall / F1 (in %) for all eight models.
N = 8               # number of models
ind = np.arange(N)  # the x locations for the groups
width = 0.2         # the width of each bar

fig = plt.figure()
ax = fig.add_subplot(111)

# Metric values in the same model order as the x-tick labels below.
yvals = [svm_prec*100, dt_prec*100, rf_prec*100, vot_prec*100,
         ens_prec*100, dnn_prec*100, hdtl_prec*100, hdtl1_prec*100]
rects1 = ax.bar(ind, yvals, width, color='r')

zvals = [svm_rec*100, dt_rec*100, rf_rec*100, vot_rec*100,
         ens_rec*100, dnn_rec*100, hdtl_rec*100, hdtl1_rec*100]
rects2 = ax.bar(ind + width, zvals, width, color='g')

kvals = [svm_f1*100, dt_f1*100, rf_f1*100, vot_f1*100,
         ens_f1*100, dnn_f1*100, hdtl_f1*100, hdtl1_f1*100]
rects3 = ax.bar(ind + width*2, kvals, width, color='b')

ax.set_ylabel('Scores')
ax.set_xticks(ind + width)  # center the tick under the middle bar of each group
ax.set_xticklabels(('SVM','DT','RF','Voting','CNN+LSTM-RO','DNN-BO','HDTL-BO','HDTL-RO'))
# BUG FIX: legend label typo 'F1-SCore' -> 'F1-Score'.
ax.legend((rects1[0], rects2[0], rects3[0]), ('Precision', 'Recall', 'F1-Score'))

def autolabel(rects):
    """Annotate each bar with its integer-truncated height."""
    for rect in rects:
        h = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.05*h, '%d' % int(h),
                ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

plt.show()
In [103]:
# Persist the fitted voting ensemble so it can be reloaded later
# without retraining.
import joblib

filename = 'model.sav'
joblib.dump(eclf1, filename)
Out[103]:
['model.sav']
In [ ]: